{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer,HashingVectorizer,TfidfTransformer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "arr1 = [\n",
    "    \"This is spark, spark sql a every good\",\n",
    "    \"Spark Hadoop Hbase\",\n",
    "    \"This is sample\",\n",
    "    \"This is anthor example anthor example\",\n",
    "    \"spark hbase hadoop spark hive hbase hue oozie\",\n",
    "    \"hue oozie spark\"\n",
    "]\n",
    "arr2 = [\n",
    "    \"this is a sample a example\",\n",
    "    \"this c c cd is another another sample example example\",\n",
    "    \"spark Hbase hadoop Spark hive hbase\"\n",
    "]\n",
    "df = arr2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[ 0.          0.          0.5         0.          0.          0.          0.5\n",
      "   0.5         0.          0.5       ]\n",
      " [ 0.66486672  0.33243336  0.50564828  0.          0.          0.\n",
      "   0.25282414  0.25282414  0.          0.25282414]\n",
      " [ 0.          0.          0.          0.31622777  0.63245553  0.31622777\n",
      "   0.          0.          0.63245553  0.        ]]\n",
      "['another', 'cd', 'example', 'hadoop', 'hbase', 'hive', 'is', 'sample', 'spark', 'this']\n",
      "None\n",
      "转换另外的文档数据\n",
      "[[ 0.          0.          0.          0.          0.          0.\n",
      "   0.3349067   0.          0.88072413  0.3349067 ]\n",
      " [ 0.          0.          0.          0.57735027  0.57735027  0.          0.\n",
      "   0.          0.57735027  0.        ]\n",
      " [ 0.          0.          0.          0.          0.          0.\n",
      "   0.57735027  0.57735027  0.          0.57735027]\n",
      " [ 0.          0.          0.81649658  0.          0.          0.\n",
      "   0.40824829  0.          0.          0.40824829]\n",
      " [ 0.          0.          0.          0.31622777  0.63245553  0.31622777\n",
      "   0.          0.          0.63245553  0.        ]\n",
      " [ 0.          0.          0.          0.          0.          0.          0.\n",
      "   0.          1.          0.        ]]\n"
     ]
    }
   ],
   "source": [
    "tfidf = TfidfVectorizer(min_df=0, dtype=np.float64)\n",
    "df2 = tfidf.fit_transform(df)\n",
    "print (df2.toarray())\n",
    "print (tfidf.get_feature_names())\n",
    "print (tfidf.get_stop_words())\n",
    "print (\"转换另外的文档数据\")\n",
    "print (tfidf.transform(arr1).toarray())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[ 0.  1.  1.  0.  0.  0.  0.  1.  0.  1.]\n",
      " [ 0.  1.  1.  0.  1.  0.  2.  1.  0.  2.]\n",
      " [ 1.  2.  0.  0.  0.  0.  2.  0.  1.  0.]]\n",
      "None\n",
      "转换另外的文档数据\n",
      "[[ 0.  2.  0.  0.  0.  0.  0.  1.  1.  1.]\n",
      " [ 0.  1.  0.  0.  0.  0.  1.  0.  1.  0.]\n",
      " [ 0.  1.  1.  0.  0.  0.  0.  1.  0.  0.]\n",
      " [ 0.  0.  1.  0.  2.  0.  0.  1.  0.  2.]\n",
      " [ 1.  2.  0.  0.  1.  0.  1.  0.  1.  0.]\n",
      " [ 0.  1.  0.  0.  1.  0.  1.  0.  0.  0.]]\n"
     ]
    }
   ],
   "source": [
    "hashing = HashingVectorizer(n_features=10, non_negative=True, norm=None)\n",
    "df3 = hashing.fit_transform(df)\n",
    "print (df3.toarray())\n",
    "print (hashing.get_stop_words())\n",
    "print (\"转换另外的文档数据\")\n",
    "print (hashing.transform(arr1).toarray())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[ 0.  0.  1.  0.  0.  0.  1.  1.  0.  1.]\n",
      " [ 2.  1.  2.  0.  0.  0.  1.  1.  0.  1.]\n",
      " [ 0.  0.  0.  1.  2.  1.  0.  0.  2.  0.]]\n",
      "None\n",
      "['another', 'cd', 'example', 'hadoop', 'hbase', 'hive', 'is', 'sample', 'spark', 'this']\n",
      "转换另外的文档数据\n",
      "[[ 0.  0.  0.  0.  0.  0.  1.  0.  2.  1.]\n",
      " [ 0.  0.  0.  1.  1.  0.  0.  0.  1.  0.]\n",
      " [ 0.  0.  0.  0.  0.  0.  1.  1.  0.  1.]\n",
      " [ 0.  0.  2.  0.  0.  0.  1.  0.  0.  1.]\n",
      " [ 0.  0.  0.  1.  2.  1.  0.  0.  2.  0.]\n",
      " [ 0.  0.  0.  0.  0.  0.  0.  0.  1.  0.]]\n"
     ]
    }
   ],
   "source": [
    "count = CountVectorizer(min_df=0.1, dtype=np.float64, ngram_range=(0,1))\n",
    "df4 = count.fit_transform(df)\n",
    "print (df4.toarray())\n",
    "print (count.get_stop_words())\n",
    "print (count.get_feature_names())\n",
    "print (\"转换另外的文档数据\")\n",
    "print (count.transform(arr1).toarray())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[ 0.          0.          0.5         0.          0.          0.          0.5\n",
      "   0.5         0.          0.5       ]\n",
      " [ 0.66486672  0.33243336  0.50564828  0.          0.          0.\n",
      "   0.25282414  0.25282414  0.          0.25282414]\n",
      " [ 0.          0.          0.          0.31622777  0.63245553  0.31622777\n",
      "   0.          0.          0.63245553  0.        ]]\n",
      "转换另外的文档数据\n",
      "[[ 0.          0.          0.          0.          0.          0.\n",
      "   0.3349067   0.          0.88072413  0.3349067 ]\n",
      " [ 0.          0.          0.          0.57735027  0.57735027  0.          0.\n",
      "   0.          0.57735027  0.        ]\n",
      " [ 0.          0.          0.          0.          0.          0.\n",
      "   0.57735027  0.57735027  0.          0.57735027]\n",
      " [ 0.          0.          0.81649658  0.          0.          0.\n",
      "   0.40824829  0.          0.          0.40824829]\n",
      " [ 0.          0.          0.          0.31622777  0.63245553  0.31622777\n",
      "   0.          0.          0.63245553  0.        ]\n",
      " [ 0.          0.          0.          0.          0.          0.          0.\n",
      "   0.          1.          0.        ]]\n"
     ]
    }
   ],
   "source": [
    "tfidf2 = TfidfTransformer()\n",
    "df5 = tfidf2.fit_transform(df4)\n",
    "print (df5.toarray())\n",
    "print (\"转换另外的文档数据\")\n",
    "print (tfidf2.transform(count.transform(arr1)).toarray())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "dataset = [['my', 'dog', 'dog', 'has', 'flea', 'problems', 'help', 'please'],\n",
    "         ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],\n",
    "         ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],\n",
    "         ['stop', 'posting', 'stupid', 'worthless', 'garbage'],\n",
    "         ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],\n",
    "         ['quit','worthless', 'buying', 'worthless', 'dog', 'food', 'stupid']]\n",
    "vocabSet = set()\n",
    "for doc in dataset:\n",
    "    vocabSet |= set(doc)\n",
    "vocabList = list(vocabSet)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "SOW = []\n",
    "for doc in dataset:\n",
    "    vec = [0]*len(vocabList)\n",
    "    for i, word in enumerate(vocabList):\n",
    "        if word in doc:\n",
    "            vec[i] = 1\n",
    "    SOW.append(vec) \n",
    "\n",
    "# 词袋模型\n",
    "BOW = []\n",
    "for doc in dataset:\n",
    "    vec = [0]*len(vocabList)\n",
    "    for word in doc:\n",
    "        vec[vocabList.index(word)] += 1\n",
    "    BOW.append(vec)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "词袋法以及词集法\n",
      "[[0 0 0 0 0 0 0 1 1 0 1 1 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]\n",
      " [0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 1]\n",
      " [0 1 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 1 0 0 1 0]\n",
      " [1 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]\n",
      " [1 0 0 1 0 1 1 0 0 0 0 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 1 0 0 0]\n",
      " [0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0]]\n",
      "[[0 0 0 0 0 0 0 1 1 0 1 2 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]\n",
      " [0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 1]\n",
      " [0 1 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 1 0 0 1 0]\n",
      " [1 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]\n",
      " [1 0 0 1 0 1 1 0 0 0 0 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 1 0 0 0]\n",
      " [0 0 0 0 2 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0]]\n"
     ]
    }
   ],
   "source": [
    "print (\"词袋法以及词集法\")\n",
    "print (np.array(SOW))\n",
    "print (np.array(BOW))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
